package com.tcheung.common.utils;

import com.pandawork.core.log.LogClerk;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.poi.hwpf.extractor.WordExtractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Utility class that converts a Word file into a Lucene {@link Document}.
 *
 * @author: zhangteng
 * @time: 2014/12/14 13:39
 */
public class LuceneDOCDocumentUtil {

    /**
     * Builds a Lucene {@link Document} from the given Word file.
     *
     * <p>The resulting document carries three stored, analyzed fields:
     * {@code title} (the file name), {@code content} (the text returned by
     * {@link WordExtractor#getText()}) and {@code path} (the absolute path).
     *
     * @param file the Word file to convert
     * @return the populated document, or {@code null} when {@code file} is
     *         {@code null}, does not exist, or cannot be read (the
     *         {@link IOException} is logged through {@code LogClerk.errLog})
     */
    public static Document getDocument(File file) {
        if (file == null || !file.exists()) {
            return null;
        }

        String content;
        // try-with-resources guarantees the file handle is released on every path.
        try (FileInputStream input = new FileInputStream(file)) {
            WordExtractor wordExtractor = new WordExtractor(input);
            content = wordExtractor.getText();
        } catch (IOException e) {
            LogClerk.errLog.error(e);
            // Without extracted text there is nothing to index; bail out instead of
            // dereferencing an extractor that was never created.
            return null;
        }

        String title = file.getName();
        String path = file.getAbsolutePath();

        Document document = new Document();
        document.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("path", path, Field.Store.YES, Field.Index.ANALYZED));

        return document;
    }
}
